In [ ]:
import numpy as np
import torch
from sklearn.datasets import fetch_california_housing
from torch.utils.data import TensorDataset, DataLoader
import pandas as pd
import matplotlib.pyplot as plt
from typing import Any
In [ ]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
cuda
In [ ]:
# Fetch the California housing dataset (downloads on first run, cached
# afterwards) as a single pandas DataFrame: 8 feature columns plus the
# `MedHouseVal` target, 20640 rows.
california_housing = fetch_california_housing(as_frame=True).frame
california_housing
Out[ ]:
MedInc HouseAge AveRooms AveBedrms Population AveOccup Latitude Longitude MedHouseVal
0 8.3252 41.0 6.984127 1.023810 322.0 2.555556 37.88 -122.23 4.526
1 8.3014 21.0 6.238137 0.971880 2401.0 2.109842 37.86 -122.22 3.585
2 7.2574 52.0 8.288136 1.073446 496.0 2.802260 37.85 -122.24 3.521
3 5.6431 52.0 5.817352 1.073059 558.0 2.547945 37.85 -122.25 3.413
4 3.8462 52.0 6.281853 1.081081 565.0 2.181467 37.85 -122.25 3.422
... ... ... ... ... ... ... ... ... ...
20635 1.5603 25.0 5.045455 1.133333 845.0 2.560606 39.48 -121.09 0.781
20636 2.5568 18.0 6.114035 1.315789 356.0 3.122807 39.49 -121.21 0.771
20637 1.7000 17.0 5.205543 1.120092 1007.0 2.325635 39.43 -121.22 0.923
20638 1.8672 18.0 5.329513 1.171920 741.0 2.123209 39.43 -121.32 0.847
20639 2.3886 16.0 5.254717 1.162264 1387.0 2.616981 39.37 -121.24 0.894

20640 rows × 9 columns

In [ ]:
# Keep a single feature (median income) and the target, assign a reproducible
# 80/20 train/test split, and z-score both columns (the 1e-6 guards against a
# zero std).
# NOTE(review): mean/std are computed over the FULL frame (train + test), so
# test-set statistics leak into the normalisation — confirm this is
# acceptable for this experiment, or fit the statistics on the train split only.
data = (
    california_housing
    [["MedInc", "MedHouseVal"]]
    .assign(split=lambda df: np.random.RandomState(42).choice(["train", "test"], p=[.8, .2], size=df.shape[0]))
    .rename(columns={"MedInc": "X_orig", "MedHouseVal": "y_orig"})
    .assign(
        X=lambda df: (df["X_orig"] - df["X_orig"].mean()) / (df["X_orig"].std() + 1e-6),
        y=lambda df: (df["y_orig"] - df["y_orig"].mean()) / (df["y_orig"].std() + 1e-6)
    )
)
data
Out[ ]:
X_orig y_orig split X y
0 8.3252 4.526 train 2.344708 2.129578
1 8.3014 3.585 test 2.332180 1.314123
2 7.2574 3.521 train 1.782655 1.258662
3 5.6431 3.413 train 0.932944 1.165071
4 3.8462 3.422 train -0.012881 1.172870
... ... ... ... ... ...
20635 1.5603 0.781 train -1.216098 -1.115776
20636 2.5568 0.771 test -0.691576 -1.124442
20637 1.7000 0.923 train -1.142565 -0.992722
20638 1.8672 0.847 test -1.054557 -1.058582
20639 2.3886 0.894 train -0.780110 -1.017852

20640 rows × 5 columns

In [ ]:
# Materialise per-split numpy arrays of the raw and normalised columns, then
# move the normalised train/test features and targets to `device` as float32
# column vectors of shape (n, 1).
train_rows = data[data["split"] == "train"]
test_rows = data[data["split"] == "test"]

X_train_orig_numpy = train_rows["X_orig"].to_numpy()
y_train_orig_numpy = train_rows["y_orig"].to_numpy()

X_train_numpy = train_rows["X"].to_numpy()
y_train_numpy = train_rows["y"].to_numpy()

X_test_numpy = test_rows["X"].to_numpy()
y_test_numpy = test_rows["y"].to_numpy()

X_test_orig_numpy = test_rows["X_orig"].to_numpy()
y_test_orig_numpy = test_rows["y_orig"].to_numpy()

def _to_device_tensor(arr):
    """Reshape a 1-D array to (n, 1) and move it to `device` as float32."""
    return torch.tensor(arr.reshape(-1, 1), dtype=torch.float32, device=device)

X_train = _to_device_tensor(X_train_numpy)
y_train = _to_device_tensor(y_train_numpy)

X_test = _to_device_tensor(X_test_numpy)
y_test = _to_device_tensor(y_test_numpy)
In [ ]:
def viz_data():
    """Overlay train/test histograms of the feature, raw (top) and normalised (bottom)."""
    fig, axes = plt.subplots(nrows=2, figsize=(8, 8))
    panels = [
        (axes[0], X_train_orig_numpy, X_test_orig_numpy, "X_<split>_orig_numpy"),
        (axes[1], X_train_numpy, X_test_numpy, "X_<split>_numpy"),
    ]
    for ax, train_values, test_values, xlabel in panels:
        ax.hist(train_values, bins=20, label="train", alpha=.5, density=True)
        ax.hist(test_values, bins=20, label="test", alpha=.5, density=True)
        ax.set_xlabel(xlabel)
        ax.legend()

viz_data()
In [ ]:
class MCDropout(torch.nn.Module):
  """Dropout that stays active at inference time (Monte Carlo dropout).

  Unlike `torch.nn.Dropout`, `forward` always passes `training=True`, so
  repeated forward passes — even after `net.eval()` — sample fresh dropout
  masks; the spread across passes approximates predictive uncertainty.
  """

  def __init__(self, p: float) -> None:
    # Initialise the Module machinery before assigning any attribute;
    # assigning on an uninitialised Module is fragile and unidiomatic.
    super().__init__()
    if not 0.0 <= p <= 1.0:
      raise ValueError(f"dropout probability has to be between 0 and 1, but got {p}")
    self.p = p

  def forward(self, input: torch.Tensor) -> torch.Tensor:
    # training=True forces mask sampling regardless of self.training.
    return torch.nn.functional.dropout(input, p=self.p, training=True)
In [ ]:
def create_net(net_description: dict[str, Any]) -> torch.nn.Module:
    """Build the regression MLP described by `net_description`.

    Recognised keys (all optional):
        activation: only "gelu" is implemented (default "gelu").
        dropout_p: dropout probability (default .5).
        dropout_type: "regular", "monte_carlo" or "None" (default "None",
            i.e. no dropout layers).
        num_hidden_neurons: width of the three hidden layers (default 50).

    Returns:
        The network moved to the module-level `device`.

    Raises:
        NotImplementedError: on an unknown activation or dropout_type.
    """
    def get_activation() -> torch.nn.Module:
        act = net_description.get("activation", "gelu")
        if act == "gelu":
            return torch.nn.GELU()
        raise NotImplementedError(f"Unknown activation: {act}.")

    def get_dropout() -> list[torch.nn.Module]:
        dropout_p = net_description.get("dropout_p", .5)
        # BUG FIX: the default here used to be the float .5 (copy-pasted from
        # dropout_p), so omitting dropout_type raised NotImplementedError.
        # Default to "None" (no dropout) instead.
        dropout_type = net_description.get("dropout_type", "None")

        if dropout_type == "regular":
            return [torch.nn.Dropout(p=dropout_p)]

        if dropout_type == "monte_carlo":
            return [MCDropout(p=dropout_p)]

        if dropout_type == "None":
            return []

        raise NotImplementedError(f"Unknown dropout: {dropout_type}.")

    num_hidden_neurons = net_description.get("num_hidden_neurons", 50)

    # 1 -> H -> H -> H -> 1 MLP. Dropout (if any) goes after the second and
    # third activations; the final Sigmoid + Linear squashes the output and
    # then rescales it with a learned affine map.
    return torch.nn.Sequential(
        torch.nn.Linear(in_features=1, out_features=num_hidden_neurons, bias=True),
        get_activation(),
        torch.nn.Linear(in_features=num_hidden_neurons, out_features=num_hidden_neurons, bias=True),
        get_activation(),
        *get_dropout(),
        torch.nn.Linear(in_features=num_hidden_neurons, out_features=num_hidden_neurons, bias=True),
        get_activation(),
        *get_dropout(),
        torch.nn.Linear(in_features=num_hidden_neurons, out_features=1, bias=True),
        torch.nn.Sigmoid(),
        torch.nn.Linear(in_features=1, out_features=1, bias=True),
    ).to(device=device)
In [ ]:
def train_net(
    net_description: dict[str, Any],
    random_seed: int,
    num_epochs: int,
    batch_size: int,
    learning_rate: float,
    gamma: float,
    model_debug_data_save_frequency: int = 10
):
    """Train a freshly created net on the module-level train/test tensors.

    Args:
        net_description: config dict forwarded to `create_net`.
        random_seed: torch seed, set before weight initialisation.
        num_epochs: number of full passes over the training set.
        batch_size: minibatch size for both loaders.
        learning_rate: initial AdamW learning rate.
        gamma: per-epoch multiplicative LR decay (ExponentialLR).
        model_debug_data_save_frequency: every this many epochs, snapshot the
            gradients and weights seen on the first batch of that epoch.

    Returns:
        {"metrics": per-epoch DataFrame (losses, lr, optional debug
        snapshots), "net": the trained module}.
    """
    torch.manual_seed(random_seed)
    net = create_net(net_description=net_description)
    optim = torch.optim.AdamW(params=net.parameters(), lr=learning_rate)
    scheduler = torch.optim.lr_scheduler.ExponentialLR(optim, gamma=gamma)

    # Uses the module-level tensors, which are already on `device`.
    train_dataset = TensorDataset(X_train, y_train)
    test_dataset = TensorDataset(X_test, y_test)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    metrics = []

    for i_epoch in range(num_epochs):
        model_debug_data = None
        verbose = i_epoch % 50 == 0  # print losses every 50 epochs
        net.train()
        train_losses = []
        for batch_index, (input, target) in enumerate(train_loader):
            net.zero_grad()
            output = net.forward(input)
            loss = torch.nn.functional.mse_loss(output, target)

            loss.backward()

            optim.step()
            # Snapshot first-batch gradients/weights on every N-th epoch.
            # This runs after optim.step(), so the weights are post-update
            # while the gradients are the ones that produced that update.
            if batch_index == 0 and i_epoch % model_debug_data_save_frequency == 0:
                model_debug_data = {
                    "gradients": {
                        name: param.grad.detach().cpu().numpy()
                        for name, param in net.named_parameters()
                    },
                    "weights": {
                        name: param.detach().cpu().numpy()
                        for name, param in net.named_parameters()
                    }
                }

            train_losses.append(loss.item())

        # Record the LR actually used this epoch, before the scheduler decays it.
        current_learning_rate = optim.param_groups[0]["lr"]

        scheduler.step()

        train_loss = np.array(train_losses).mean()

        if verbose:
            print(f"[Epoch: {i_epoch}] Train loss: {train_loss}")

        # Evaluation pass. Note that an MCDropout layer keeps sampling masks
        # even in eval() mode, so test losses are stochastic for MC nets.
        net.eval()
        with torch.no_grad():
            test_losses = []
            for input, target in test_loader:
                output = net.forward(input)
                loss = torch.nn.functional.mse_loss(output, target)

                test_losses.append(loss.item())

            test_loss = np.array(test_losses).mean()
            if verbose:
                print(f"[Epoch: {i_epoch}] Test loss: {test_loss}")

        metrics.append(
            {
                "epoch": i_epoch,
                "train_loss": train_loss,
                "test_loss": test_loss,
                "model_debug_data": model_debug_data,
                "current_learning_rate": current_learning_rate
            }
        )
    return {
        "metrics": pd.DataFrame(metrics),
        "net": net,
    }
In [ ]:
def viz_training(training_bundle, num_pred: int = 1):
    """Plot a 2x2 summary of a training run.

    Panels: loss curves; predictions vs ground truth (normalised space);
    mean squared gradient magnitude per parameter; mean squared weight
    magnitude per parameter.

    Args:
        training_bundle: dict returned by `train_net` ("net" + "metrics").
        num_pred: forward passes per split. Values > 1 are intended for
            Monte Carlo dropout nets, where each pass samples a new mask.
    """
    net: torch.nn.Module = training_bundle["net"]
    metrics_df: pd.DataFrame = training_bundle["metrics"]

    fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(20, 20))

    loss_ax, pred_vs_true_ax, gradient_ax, weight_ax = axes.ravel()

    loss_ax.plot(metrics_df["epoch"], metrics_df["train_loss"], label="train_loss")
    loss_ax.plot(metrics_df["epoch"], metrics_df["test_loss"], label="test_loss")
    loss_ax.set_xlabel("Epoch")
    loss_ax.set_ylabel("Loss")
    loss_ax.legend()

    # Ground truth for both splits (module-level normalised arrays).
    pred_vs_true_ax.scatter(X_train_numpy.ravel(), y_train_numpy, label="train true", alpha=.1)
    pred_vs_true_ax.scatter(X_test_numpy.ravel(), y_test_numpy, label="test true", alpha=.1)
    net.eval()
    with torch.no_grad():
        preds_train_x = []
        preds_train_y = []
        preds_test_x = []
        preds_test_y = []
        # Repeat predictions num_pred times; the x values are duplicated so
        # every stochastic pass contributes its own scatter points.
        for _ in range(num_pred):
            preds_train_x.append(X_train_numpy.ravel())
            preds_train_y.append(net.forward(X_train).cpu().numpy().ravel())

            preds_test_x.append(X_test_numpy.ravel())
            preds_test_y.append(net.forward(X_test).cpu().numpy().ravel())

        pred_vs_true_ax.scatter(np.concatenate(preds_train_x), np.concatenate(preds_train_y), label="train pred", alpha=.5)
        pred_vs_true_ax.scatter(np.concatenate(preds_test_x), np.concatenate(preds_test_y), label="test pred", alpha=.5)


    # One panel per snapshot source. For each epoch that has a debug snapshot,
    # expand the per-parameter arrays into DataFrame columns, then reduce each
    # array to its mean squared value so it can be plotted over epochs.
    for source, ax in zip(["gradients", "weights"], [gradient_ax, weight_ax]):
        gradient_df = (
            training_bundle["metrics"]
            .loc[lambda df: df["model_debug_data"].notna()]
            .assign(
                temp=lambda df: [
                    {"epoch": epoch} | model_debug_data[source]
                    for epoch, model_debug_data
                    in zip(df["epoch"], df["model_debug_data"])
                ]
            )
            ["temp"].apply(pd.Series)
            .set_index("epoch")
            .assign(
                **{
                    # colname=colname binds the loop variable at lambda
                    # definition time; without it every lambda would see the
                    # last column name.
                    colname: (
                        lambda df, colname=colname: [
                            np.power(vec.ravel(), 2.).mean()
                            for vec in df[colname]
                        ]
                    )
                    for colname in list(training_bundle["metrics"]["model_debug_data"].iloc[0][source].keys())
                }
            )
        )
        for col in gradient_df.columns:
            ax.plot(gradient_df.index.to_numpy(), gradient_df[col].to_numpy(), label=col)

        ax.set_title(source)
        ax.legend()


    pred_vs_true_ax.legend()
In [ ]:
# Baseline run: regular dropout (p=.5), seed 42.
training_bundle_default = train_net(
    net_description={"activation": "gelu", "dropout_p": .5, "dropout_type": "regular"},
    random_seed=42,
    num_epochs=300,
    batch_size=1000,
    learning_rate=1e-2,
    gamma=.99,
    model_debug_data_save_frequency=1,
)
viz_training(training_bundle=training_bundle_default)
[Epoch: 0] Train loss: 1.4952805953867294
[Epoch: 0] Test loss: 1.1796560525894164
[Epoch: 50] Train loss: 0.5256583147189197
[Epoch: 50] Test loss: 0.4941104888916016
[Epoch: 100] Train loss: 0.5212734376682955
[Epoch: 100] Test loss: 0.48921377062797544
[Epoch: 150] Train loss: 0.5190125949242536
[Epoch: 150] Test loss: 0.48906358480453493
[Epoch: 200] Train loss: 0.516857880003312
[Epoch: 200] Test loss: 0.48817660808563235
[Epoch: 250] Train loss: 0.5187382540282082
[Epoch: 250] Test loss: 0.4878509402275085
In [ ]:
# Same configuration as the baseline, different seed (43) — gauges seed sensitivity.
training_bundle_other_seed = train_net(
    net_description={"activation": "gelu", "dropout_p": .5, "dropout_type": "regular"},
    random_seed=43,
    num_epochs=300,
    batch_size=1000,
    learning_rate=1e-2,
    gamma=.99,
    model_debug_data_save_frequency=1,
)
viz_training(training_bundle=training_bundle_other_seed)
[Epoch: 0] Train loss: 0.763431563096888
[Epoch: 0] Test loss: 0.6667608618736267
[Epoch: 50] Train loss: 0.5220497355741613
[Epoch: 50] Test loss: 0.4903694033622742
[Epoch: 100] Train loss: 0.5193892997853896
[Epoch: 100] Test loss: 0.4885431408882141
[Epoch: 150] Train loss: 0.51900175038506
[Epoch: 150] Test loss: 0.48777252435684204
[Epoch: 200] Train loss: 0.5191797473851372
[Epoch: 200] Test loss: 0.4880248963832855
[Epoch: 250] Train loss: 0.5179159693858203
[Epoch: 250] Test loss: 0.48744412064552306
In [ ]:
# Ablation: same net without any dropout layers (dropout_type="None"), seed 43.
training_bundle_no_dropout = train_net(
    net_description={"activation": "gelu", "dropout_type": "None"},
    random_seed=43,
    num_epochs=300,
    batch_size=1000,
    learning_rate=1e-2,
    gamma=.99,
    model_debug_data_save_frequency=1,
)
viz_training(training_bundle=training_bundle_no_dropout)
[Epoch: 0] Train loss: 0.7599048018455505
[Epoch: 0] Test loss: 0.665152621269226
[Epoch: 50] Train loss: 0.5133729752372292
[Epoch: 50] Test loss: 0.48799843788146974
[Epoch: 100] Train loss: 0.5112034678459167
[Epoch: 100] Test loss: 0.48818291425704957
[Epoch: 150] Train loss: 0.5117473076371586
[Epoch: 150] Test loss: 0.4866418719291687
[Epoch: 200] Train loss: 0.5127150083289427
[Epoch: 200] Test loss: 0.48709317445755007
[Epoch: 250] Train loss: 0.5107313131584841
[Epoch: 250] Test loss: 0.4867681682109833
In [ ]:
# Monte Carlo dropout run (masks stay active at eval time), seed 42.
training_bundle_mc = train_net(
    net_description={"activation": "gelu", "dropout_p": .5, "dropout_type": "monte_carlo"},
    random_seed=42,
    num_epochs=300,
    batch_size=1000,
    learning_rate=1e-2,
    gamma=.99,
    model_debug_data_save_frequency=1,
)
viz_training(training_bundle=training_bundle_mc)
[Epoch: 0] Train loss: 1.4952805953867294
[Epoch: 0] Test loss: 1.1811462044715881
[Epoch: 50] Train loss: 0.5257831724251018
[Epoch: 50] Test loss: 0.5057389855384826
[Epoch: 100] Train loss: 0.5239530679057626
[Epoch: 100] Test loss: 0.49976211190223696
[Epoch: 150] Train loss: 0.5209505820975584
[Epoch: 150] Test loss: 0.4927620947360992
[Epoch: 200] Train loss: 0.518083945793264
[Epoch: 200] Test loss: 0.4936288416385651
[Epoch: 250] Train loss: 0.5198071038021761
[Epoch: 250] Test loss: 0.4971176326274872
In [ ]:
# Re-plot the MC-dropout run with 10 stochastic forward passes per input;
# each pass samples a fresh dropout mask, so the scatter shows predictive spread.
viz_training(training_bundle=training_bundle_mc, num_pred=10)
In [ ]:
# Monte Carlo dropout with a different seed (43).
training_bundle_mc_other_seed = train_net(
    net_description={"activation": "gelu", "dropout_p": .5, "dropout_type": "monte_carlo"},
    random_seed=43,
    num_epochs=300,
    batch_size=1000,
    learning_rate=1e-2,
    gamma=.99,
    model_debug_data_save_frequency=1,
)
viz_training(training_bundle=training_bundle_mc_other_seed)
[Epoch: 0] Train loss: 0.763431563096888
[Epoch: 0] Test loss: 0.6650225281715393
[Epoch: 50] Train loss: 0.522653192281723
[Epoch: 50] Test loss: 0.4953924059867859
[Epoch: 100] Train loss: 0.5203518446754006
[Epoch: 100] Test loss: 0.49757230281829834
[Epoch: 150] Train loss: 0.5198933622416329
[Epoch: 150] Test loss: 0.4940872311592102
[Epoch: 200] Train loss: 0.5204210474210627
[Epoch: 200] Test loss: 0.490051406621933
[Epoch: 250] Train loss: 0.5169882458799026
[Epoch: 250] Test loss: 0.4932470917701721